library(data.table)
library(ape)
library(phylobase)
library(phytools)
library(phangorn)
library(stringr)
library(geiger)
library(RSvgDevice)
library(reshape2)
library(ggplot2)
# Annotate branches function. This is used to label a particular branch with
# something... #
annotate_transfer_num <- function(edge_entry, index, colName) {
edge <- edge_entry[1:2]
edge_label <- as.character(round(edge_entry[, colName], digits = 2))
# print(edge_label)
edgelabels(edge_label, index, adj = c(0.5, -0.25), bg = "white", frame = "none",
cex = 1)
}
# Plot particular branches on the phylogeny #
PlotSegmentByNodes <- function(nodes0, nodes1, colour) {
lastPP <- get("last_plot.phylo", envir = .PlotPhyloEnv)
if (length(nodes0) != length(nodes1)) {
tmp <- cbind(nodes0, nodes1)
nodes0 <- tmp[, 1]
nodes1 <- tmp[, 2]
}
x0 <- lastPP$xx[nodes0]
y0 <- lastPP$yy[nodes0]
x1 <- lastPP$xx[nodes1]
y1 <- lastPP$yy[nodes1]
segments(x0, y0, x0, y1, lty = 1, lwd = 3, col = colour)
segments(x0, y1, x1, y1, lty = 1, lwd = 3, col = colour)
}
# Calculate all the relevant correlations for a given transfer column #
CalcCorrLengthTransfers <- function(full_df, transfer_column, outlier_index) {
full_no_outlier_df <- full_df[!(full_df$Index == outlier_index), ]
# Correlations - make df and fill it with different combinations #
cor_df <- data.frame(Condition = character(), Pearson = double(), Spearman = double(),
Rsquared = double(), stringsAsFactors = FALSE)
# All together - outlier kept #
cor_df <- rbind(cor_df, data.frame(Condition = "All with outlier", Pearson = cor(full_df$Branch_len,
full_df[, transfer_column], method = "pearson"), Spearman = cor(full_df$Branch_len,
full_df[, transfer_column], method = "spearman"), Rsquared = summary(lm(full_df$Branch_len ~
full_df[, transfer_column]))$r.squared))
# All together - outlier removed #
cor_df <- rbind(cor_df, data.frame(Condition = "All no outlier", Pearson = cor(full_no_outlier_df$Branch_len,
full_no_outlier_df[, transfer_column], method = "pearson"), Spearman = cor(full_no_outlier_df$Branch_len,
full_no_outlier_df[, transfer_column], method = "spearman"), Rsquared = summary(lm(full_no_outlier_df$Branch_len ~
full_no_outlier_df[, transfer_column]))$r.squared))
# Split by Subgroup # Intergroup #
cor_df <- rbind(cor_df, data.frame(Condition = "Intergroup no outlier",
Pearson = cor(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup ==
TRUE)], full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup ==
TRUE)], method = "pearson"), Spearman = cor(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup ==
TRUE)], full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup ==
TRUE)], method = "spearman"), Rsquared = summary(lm(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup ==
TRUE)] ~ full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup ==
TRUE)]))$r.squared))
# Intragroup #
cor_df <- rbind(cor_df, data.frame(Condition = "Intragroup no outlier",
Pearson = cor(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup ==
FALSE)], full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup ==
FALSE)], method = "pearson"), Spearman = cor(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup ==
FALSE)], full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup ==
FALSE)], method = "spearman"), Rsquared = summary(lm(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup ==
FALSE)] ~ full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup ==
FALSE)]))$r.squared))
cor_df[, 2:4] <- cbind(round(cor_df$Pearson, digits = 4), round(cor_df$Spearman,
digits = 4), round(cor_df$Rsquared, digits = 4))
return(cor_df)
}
## Variables used globally ##
penalties = c(4:6)
min_taxa = 0
single_pen_plot = TRUE
single_pen_val = 5
if (min_taxa > 4) {
file_name_add <- paste0("_min_", min_taxa)
} else {
file_name_add <- ""
}
Here we load and compare the phylogenies derived from the all-species cladogram in the Mowgli analysis and the concatenated vertical protein alignment (time-proxy). We want to take the transfers assigned to branches on the species cladogram and plot them on the time-resolved tree. For this have to map the corresponding branches from the species cladorgram to the time-resolved tree and identify any incongruences.
Isolate the Anoxybacillus/Geobacillus phylogeny used in the reconciliation analysis by deriving it from the all-species tree used by Mowgli.
species_tree_clado <- read.tree("/Users/aesin/Desktop/Mowgli/Species_tree/Reconciled_species_tree/outputSpeciesTree.mpr")
# Get the set of Geobacillus and Anoxybacillus tips and combine into a
# single set #
anoxy_geo_clado_tips <- grep("Geobacillus_|Anoxybacillus_", species_tree_clado$tip.label,
value = TRUE)
# Anoxy/Geobacillus tips used to extract the Anoxy/Geobacillus subclade.
# Transform to phylo4 object #
anoxy_geo_clado <- drop.tip(species_tree_clado, setdiff(species_tree_clado$tip.label,
anoxy_geo_clado_tips))
anoxy_geo_clado4 <- phylo4(anoxy_geo_clado)
# Get the node labels as a data frame ##
node_lbs_df <- data.frame(labels(anoxy_geo_clado4, "all"))
internal_nodes_species <- nodeId(anoxy_geo_clado4, "internal")
names(node_lbs_df)[1] <- "Labels"
plot.phylo(anoxy_geo_clado)
This is derived from a concatenated alignment of protein products of vertically inheritted genes. Here it is rooted by midpoint and the tips are relabelled to the tips used in Mowgli (i.e. the species tree above)
1. Read in the time-resolved tree
# Read in the time-resolved tree & root by midpoint #
time_consensus_tree <- read.tree(file = "/Users/aesin/Desktop/Geo_analysis/New_geobacillus_consensus/Tree/Final_consensus_out.txt")
anoxy_geo_time <- midpoint(time_consensus_tree)
plot.phylo(anoxy_geo_time)
2. Relabel the tips of the time resolved tree to match the species tree (Mowgli tips)
# Annotate the time-resolved tree with node labels imported from the species
# tree #
anoxy_geo_time_relab <- anoxy_geo_time
for (tip in anoxy_geo_time$tip.label) {
match_clado_name <- grep(tip, anoxy_geo_clado_tips, value = T)
time_tip_position <- match(tip, anoxy_geo_time_relab$tip.label)
# Replace with new name #
anoxy_geo_time_relab$tip.label[time_tip_position] <- match_clado_name
}
# Remove node labels (BS-value) so that mowgli labels can be added; convert
# to phylo4 object #
anoxy_geo_time_relab$node.label <- NULL
anoxy_geo_time_relab_4 <- phylo4(anoxy_geo_time_relab)
plot.phylo(anoxy_geo_time_relab)
association <- matrix(ncol = 2, nrow = 22)
association[, 1] <- association[, 2] <- anoxy_geo_time_relab$tip.label
cophylo_obj <- cophylo(anoxy_geo_clado, anoxy_geo_time_relab, assoc = association)
plot.cophylo(cophylo_obj)
internal_nodes_time <- nodeId(anoxy_geo_time_relab_4, "internal")
unmapped_edges <- vector(mode = "list")
unmapped_count = 1
## For each internal node, match it to the nodes from the species tree ##
for (internal_time in internal_nodes_time) {
matched = FALSE
for (internal_species in internal_nodes_species) {
if (setequal(tips(anoxy_geo_time_relab, internal_time), tips(anoxy_geo_clado,
internal_species)) == TRUE) {
## Get the label of the node from the species-cladogram tree ##
node_label <- anoxy_geo_clado4@label[internal_species]
## Index of the internal node label in the time-resolved tree ##
node_index <- which(names(nodeLabels(anoxy_geo_time_relab_4)) ==
internal_time)
## Assign the name of the node to the time-resolved tree ##
nodeLabels(anoxy_geo_time_relab_4)[node_index] <- node_label
cat(paste0("Time-resolved tree node: ", internal_time, " maps to species tree node: ",
internal_species, "\n"))
matched = TRUE
break
}
}
if (matched == FALSE) {
cat(paste0("Time tree node: ", internal_time, " did not map to any species tree node\n"))
edge_index <- which(anoxy_geo_time_relab_4@edge[, 2] == as.character(internal_time))
unmapped_edges[[unmapped_count]] <- as.vector(anoxy_geo_time_relab_4@edge[edge_index,
])
unmapped_count = unmapped_count + 1
}
}
## Time-resolved tree node: 23 maps to species tree node: 23
## Time tree node: 24 did not map to any species tree node
## Time-resolved tree node: 25 maps to species tree node: 33
## Time-resolved tree node: 26 maps to species tree node: 30
## Time-resolved tree node: 27 maps to species tree node: 26
## Time-resolved tree node: 28 maps to species tree node: 24
## Time-resolved tree node: 29 maps to species tree node: 27
## Time-resolved tree node: 30 maps to species tree node: 28
## Time-resolved tree node: 31 maps to species tree node: 29
## Time-resolved tree node: 32 maps to species tree node: 31
## Time-resolved tree node: 33 maps to species tree node: 32
## Time-resolved tree node: 34 maps to species tree node: 35
## Time-resolved tree node: 35 maps to species tree node: 36
## Time-resolved tree node: 36 maps to species tree node: 37
## Time-resolved tree node: 37 maps to species tree node: 39
## Time-resolved tree node: 38 maps to species tree node: 42
## Time-resolved tree node: 39 maps to species tree node: 43
## Time-resolved tree node: 40 maps to species tree node: 40
## Time-resolved tree node: 41 maps to species tree node: 41
## Time-resolved tree node: 42 maps to species tree node: 38
## Time tree node: 43 did not map to any species tree node
anoxy_geo_time_relab <- as(anoxy_geo_time_relab_4, "phylo")
plot(anoxy_geo_time_relab)
# Colour in the 'missing' incongruent edges #
for (i in 1:length(unmapped_edges)) {
edge <- unmapped_edges[[i]]
PlotSegmentByNodes(edge[1], edge[2], "red")
}
# Prepare the count table for the time-resolved tree as for the
# species-derived prune above ##
node_labels_dfs <- list(clado = data.frame(labels(anoxy_geo_clado4, "all")),
time = data.frame(labels(anoxy_geo_time_relab_4, "all")))
node_labels_dfs <- lapply(node_labels_dfs, setnames, "Node_labels")
lapply(node_labels_dfs, head)
## $clado
## Node_labels
## 1 Anoxybacillus_flavithermus_WK1_3602
## 2 Anoxybacillus_kamchatkensis_3603
## 3 Anoxybacillus_tepidamans_3605
## 4 Geobacillus_caldoxylosilyticus_3606
## 5 Geobacillus_sp_WCH70_3607
## 6 Geobacillus_sp_Y41MC1_3608
##
## $time
## Node_labels
## 1 Anoxybacillus_flavithermus_WK1_3602
## 2 Geobacillus_stearothermophilus_3619
## 3 Geobacillus_sp_G1w1_3618
## 4 Geobacillus_vulcani_3620
## 5 Geobacillus_sp_C56_T3_3630
## 6 Geobacillus_sp_Y412MC52_3631
The list contains two dataframes - one each to store the number of transfers per branch of the clado and time-resolved trees. The order of edges in these dataframes is the same as the order of node labels.
# Create dataframe in which the num. transfers at each penalty for each edge
# will be stored #
edge_dfs <- list(clado = data.frame(anoxy_geo_clado$edge), time = data.frame(anoxy_geo_time$edge))
edge_dfs <- lapply(edge_dfs, setnames, c("E1", "E2"))
edge_dfs <- lapply(edge_dfs, function(x) cbind(x, Subgroup = NA))
# For each dataframe, create columns for each penalty - fill with 0s as
# placeholders #
penalty_names <- lapply(as.list(penalties), function(x) paste0("T", x))
edge_dfs <- lapply(edge_dfs, function(x) {
for (penalty in penalty_names) {
x[eval(penalty)] <- 0
}
x
})
lapply(edge_dfs, head)
## $clado
## E1 E2 Subgroup T4 T5 T6
## 1 23 24 NA 0 0 0
## 2 24 1 NA 0 0 0
## 3 24 2 NA 0 0 0
## 4 23 25 NA 0 0 0
## 5 25 3 NA 0 0 0
## 6 25 26 NA 0 0 0
##
## $time
## E1 E2 Subgroup T4 T5 T6
## 1 28 1 NA 0 0 0
## 2 28 22 NA 0 0 0
## 3 43 28 NA 0 0 0
## 4 43 21 NA 0 0 0
## 5 31 17 NA 0 0 0
## 6 31 18 NA 0 0 0
We read in the per-penalty transfer tables - these contain all transfers into (Anoxy)Geobacillus with associated edges, i.e. two nodes that are presented in the Mowgli species tree format (e.g. 3644). Using the node label table, we can then associate the transfers with edges in our two output dataframes.
# Directory containing the per-penalty transfer files #
input_edge_dir <- "/Users/aesin/Desktop/Mowgli/Long_distance_HGT/Full/Scenarios_1_2/Receptor_edges/"
for (penalty in penalties) {
# Read in the refined receptor edge list for a particular penalty #
receptor_df <- read.table(file = paste0(input_edge_dir, "T", penalty, "_refined_receptor_edges",
file_name_add, ".txt"), header = FALSE, sep = "\t")
# Column name for this penalty #
cname <- paste0("T", penalty)
# For each transfer, we can identify the edge #
at_root = 0
for (i in 1:nrow(receptor_df)) {
row <- receptor_df[i, ]
# If one of the nodes of the edge is '3728' - this is the Anoxy/Geobacillus
# root node #
if (row[1] == 3728 || row[2] == 3728) {
at_root = at_root + 1
} else {
# Pull out row number (indication of phylogeny edge) corresponding to the
# Mowgli tree label.
edge_2_species <- grep(row[2], node_labels_dfs$clado$Node_labels)
edge_2_time <- grep(row[2], node_labels_dfs$time$Node_labels)
## Fill in the count for the species-derived tree ##
edge_dfs$clado[, cname][which(edge_dfs$clado$E2 == edge_2_species)] = edge_dfs$clado[,
cname][which(edge_dfs$clado$E2 == edge_2_species)] + 1
## Fill in the count for the time-derived tree ##
edge_dfs$time[, cname][which(edge_dfs$time$E2 == edge_2_time)] = edge_dfs$time[,
cname][which(edge_dfs$time$E2 == edge_2_time)] + 1
}
}
}
lapply(edge_dfs, head)
## $clado
## E1 E2 Subgroup T4 T5 T6
## 1 23 24 NA 23 13 9
## 2 24 1 NA 14 8 7
## 3 24 2 NA 17 10 9
## 4 23 25 NA 44 31 19
## 5 25 3 NA 37 26 15
## 6 25 26 NA 86 79 79
##
## $time
## E1 E2 Subgroup T4 T5 T6
## 1 28 1 NA 14 8 7
## 2 28 22 NA 17 10 9
## 3 43 28 NA 23 13 9
## 4 43 21 NA 37 26 15
## 5 31 17 NA 69 65 59
## 6 31 18 NA 56 55 50
Some statistics for the data so far
cat("There will be something here later")
## There will be something here later
Aliyu et al 2016 divides the Geobacillus phylogeny into a number of species subgroups (fig 3). I applied that classification to the genomes used in this analysis. Branches separating species/tips belonging to the same subgroup are considered to not have undergone substantial differentiation and purifying selection.
# Read in the table constructed based on the publication #
subspecies_groupings <- read.csv(file = "/Users/aesin/Desktop/Geo_analysis/Geo_omes/Subspecies_geo_groups.csv",
header = TRUE, sep = ",")
# Get those tips that have a fellow tip in the same subgroup. Isolate all
# the subgroups that have more than one member #
subspecies_only <- subset(subspecies_groupings, duplicated(subspecies_groupings$Group) |
duplicated(subspecies_groupings$Group, fromLast = TRUE))
unique_subgroups <- unique(subspecies_only$Group)
subspecies_only
## Species Group
## 1 Geobacillus_thermodenitrificans_NG80_2 I9
## 2 Geobacillus_sp_G11MC16 I9
## 7 Geobacillus_sp_GHH01 I2
## 8 Geobacillus_sp_WSUCF1 I2
## 9 Geobacillus_sp_C56_T3 I1
## 10 Geobacillus_sp_Y412MC61 I1
## 11 Geobacillus_sp_Y412MC52 I1
## 12 Geobacillus_kaustophilus_HTA426 I1
## 13 Geobacillus_sp_MAS1 I1
## 14 Geobacillus_thermoleovorans_CCB_US3_UF5 I1
## 17 Geobacillus_thermoglucosidasius_C56_YS93 II3
## 18 Geobacillus_sp_Y41MC1 II3
# For each group with more than one member, identify the tips. Find all the
# descendant nodes from the common ancestor of the subgroup (they are
# monophyletic). Using those nodes, identify all the edges (branches) that
# we will classify as being a subgroup branch (1). All other branches are
# seperate groups (0)
anoxy_geo_time4 <- phylo4(anoxy_geo_time)
for (group in unique_subgroups) {
tips_in_subgroup <- as.vector(subspecies_only$Species[which(subspecies_only$Group ==
group)])
descend_nodes <- as.vector(descendants(anoxy_geo_time4, MRCA(anoxy_geo_time4,
tips_in_subgroup), "all"))
for (node in descend_nodes) {
edge_dfs$time$Subgroup[which(edge_dfs$time$E2 == node)] <- 1
}
}
edge_dfs$time$Subgroup[is.na(edge_dfs$time$Subgroup)] <- 0
head(edge_dfs$time)
## E1 E2 Subgroup T4 T5 T6
## 1 28 1 0 14 8 7
## 2 28 22 0 17 10 9
## 3 43 28 0 23 13 9
## 4 43 21 0 37 26 15
## 5 31 17 1 69 65 59
## 6 31 18 1 56 55 50
# The subgroup branches are highlighted in red #
plotBranchbyTrait(anoxy_geo_time, edge_dfs$time$Subgroup, method = "edges",
title = "Subgroup branches")
# for (i in 1:nrow(edge_dfs$time)) { edgelabels(i, i, adj = c(0.5, -0.25),
# bg = 'white', frame = 'none', cex = 0.7) }
In the next set of calculations we use the average number of transfers per branch. This way we can include the transfer data for all penalties simultaneously.
For each transfer penalty, make the transfers per branch a percentage of the total transfers for that penalty. Then we can take a mean over all the penalties.
# Save a copy of the dfs with the raw numbers #
edge_raw_dfs <- edge_dfs
# Make the transfers in at each branch a fraction of the total transfers #
edge_dfs <- lapply(edge_dfs, function(x) cbind(x[, 1:3], sweep(x[, -1:-3], 2,
colSums(x[, -1:-3]), "/") * 100))
# Make a mean column - averaging across all penalties #
edge_dfs <- lapply(edge_dfs, function(x) cbind(x, mean = rowMeans(x[, -1:-3])))
Example of the dataframe made:
lapply(edge_dfs, head)
## $clado
## E1 E2 Subgroup T4 T5 T6 mean
## 1 23 24 NA 1.2827663 0.8408797 0.6632277 0.9289579
## 2 24 1 NA 0.7808143 0.5174644 0.5158438 0.6047075
## 3 24 2 NA 0.9481316 0.6468305 0.6632277 0.7527300
## 4 23 25 NA 2.4539877 2.0051746 1.4001474 1.9531033
## 5 25 3 NA 2.0635806 1.6817594 1.1053795 1.6169065
## 6 25 26 NA 4.7964306 5.1099612 5.8216654 5.2426857
##
## $time
## E1 E2 Subgroup T4 T5 T6 mean
## 1 28 1 0 0.8041356 0.530856 0.5263158 0.6204358
## 2 28 22 0 0.9764503 0.663570 0.6766917 0.7722374
## 3 43 28 0 1.3210798 0.862641 0.6766917 0.9534709
## 4 43 21 0 2.1252154 1.725282 1.1278195 1.6594390
## 5 31 17 1 3.9632395 4.313205 4.4360902 4.2375116
## 6 31 18 1 3.2165422 3.649635 3.7593985 3.5418586
Plot the fraction of all transfers per branch on the clado species tree
# Plot the tree with edges coloured by the fraction #
plotBranchbyTrait(anoxy_geo_clado, edge_dfs$clado$mean, method = "edges", legend = 5,
title = "Fraction of transfers")
# Anotate the branches with the numeric values #
for (i in 1:nrow(edge_dfs$clado)) {
entry <- edge_dfs$clado[i, ]
annotate_transfer_num(entry, i, "mean")
}
Now plot the same fractions on the time-resolved tree - note the 0s on the incongruent edges (in black)
# Plot the tree with edges coloured by the fraction #
plotBranchbyTrait(anoxy_geo_time, edge_dfs$time$mean, method = "edges", legend = 0.1,
title = "Fraction of transfers")
# Anotate the branches with the numeric values #
for (i in 1:nrow(edge_dfs$time)) {
entry <- edge_dfs$time[i, ]
annotate_transfer_num(entry, i, "mean")
}
for (i in 1:length(unmapped_edges)) {
edge <- unmapped_edges[[i]]
PlotSegmentByNodes(edge[1], edge[2], "black")
}
edge_dfs$time <- cbind(edge_dfs$time, Branch_len = anoxy_geo_time$edge.length,
Index = rownames(edge_dfs$time))
head(edge_dfs$time)
## E1 E2 Subgroup T4 T5 T6 mean Branch_len Index
## 1 28 1 0 0.8041356 0.530856 0.5263158 0.6204358 0.040005508 1
## 2 28 22 0 0.9764503 0.663570 0.6766917 0.7722374 0.038648816 2
## 3 43 28 0 1.3210798 0.862641 0.6766917 0.9534709 0.355284380 3
## 4 43 21 0 2.1252154 1.725282 1.1278195 1.6594390 0.148146545 4
## 5 31 17 1 3.9632395 4.313205 4.4360902 4.2375116 0.001683514 5
## 6 31 18 1 3.2165422 3.649635 3.7593985 3.5418586 0.001943465 6
# First find all nodes related to Anoxybacillus
anoxybacillus_tips <- grep("Anoxybacillus", anoxy_geo_time$tip.label, value = TRUE)
anoxybacillus_nodes <- as.vector(descendants(anoxy_geo_time4, MRCA(anoxy_geo_time4,
anoxybacillus_tips), "ALL"))
# Subset our list of edge_dfs to include a geo_only data frame
removed_edges <- edge_dfs$time[edge_dfs$time$E2 %in% as.character(anoxybacillus_nodes),
1:2]
edge_dfs$time_geo <- edge_dfs$time[!edge_dfs$time$E2 %in% as.character(anoxybacillus_nodes),
]
removed_edges_l <- split(removed_edges, seq(nrow(removed_edges)))
# For each of the unmapped edges, identified previously, remove that edge as
# well (one is already gone as it was Anoxybacillus!) - subset to new
# dataframe.
edge_dfs$time_geo_congruent <- edge_dfs$time_geo
for (nodes in unmapped_edges) {
removed_edges <- rbind(removed_edges, edge_dfs$time_geo_congruent[which(edge_dfs$time_geo_congruent$E1 %in%
nodes[1] & edge_dfs$time_geo_congruent$E2 %in% nodes[2]), 1:2])
edge_dfs$time_geo_congruent <- edge_dfs$time_geo_congruent[!(edge_dfs$time_geo_congruent$E1 %in%
nodes[1] & edge_dfs$time_geo_congruent$E2 %in% nodes[2]), ]
}
plot(anoxy_geo_time)
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]],
"red")))
Perform a correlation on the Geobacillus branches. Outlier identified as point 39 - corresponds to the branch that seperates two major Geobacillus subdivisions. The two subdivisions share a very different GC content in the extrant taxa.
# Write out a seperate dataframe with the subgroup column relabelled
# logically #
congruent_geo_cor_df <- edge_dfs$time_geo_congruent
congruent_geo_cor_df$Subgroup <- as.logical(congruent_geo_cor_df$Subgroup)
ggplot(congruent_geo_cor_df, aes(x = Branch_len, y = mean, label = Index, color = Subgroup)) +
geom_point(size = 3) + geom_text(aes(label = Index), hjust = 0, vjust = -1) +
xlab("Branch length") + ylab("Mean fraction of HGTs across penalties") +
scale_color_manual(values = c("blue", "red"))
This tree outlines the various conditions that are used in making the correlations between branch length and HGTs
# Get the edges of the outlier branch #
outlier_edge <- as.numeric(congruent_geo_cor_df[(congruent_geo_cor_df$Index ==
39), 1:2])
# Replot the time-resolved tree with the excluded branches highlighted in
# black, and the branches labelled according to their index value #
plotBranchbyTrait(anoxy_geo_time, edge_dfs$time$Subgroup, method = "edges",
title = "Subgroup branches")
# Label the 'removed edges' black
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]],
"black")))
# Label the outlier edge green #
invisible(PlotSegmentByNodes(outlier_edge[1], outlier_edge[2], "green"))
# Label branches according to index #
for (i in as.vector(congruent_geo_cor_df$Index)) {
edgelabels(i, as.numeric(i), adj = c(0.5, -0.25), bg = "white", frame = "none",
cex = 0.7)
}
# Add legend #
legend("bottomright", legend = c("Intergroup", "Intragroup", "Outlier", "Excluded"),
text.col = c("blue", "red", "green", "black"))
Correlations are peformed for a number of conditions - this is still using the fraction of transfers averaged over transfer penalties 4, 5, and 6.
# Create a copy of the df with outlier removed #
congruent_geo_cor_out_df <- congruent_geo_cor_df[! (congruent_geo_cor_df$Index == 39),]
# Correlations - make df and fill it with different combinations #
cor_df <- data.frame( Condition=character(),
Pearson=double(),
Spearman=double(),
Rsquared=double(),
stringsAsFactors = FALSE)
# All together - outlier kept #
cor_df <- rbind(cor_df, data.frame( Condition = "All with outlier",
Pearson = cor(congruent_geo_cor_df$Branch_len, congruent_geo_cor_df$mean, method = "pearson"),
Spearman = cor(congruent_geo_cor_df$Branch_len, congruent_geo_cor_df$mean, method = "spearman"),
Rsquared = summary(lm(congruent_geo_cor_df$Branch_len ~ congruent_geo_cor_df$mean))$r.squared))
# All together - outlier removed #
cor_df <- rbind(cor_df, data.frame( Condition = "All no outlier",
Pearson = cor(congruent_geo_cor_out_df$Branch_len, congruent_geo_cor_out_df$mean, method = "pearson"),
Spearman = cor(congruent_geo_cor_out_df$Branch_len, congruent_geo_cor_out_df$mean, method = "spearman"),
Rsquared = summary(lm(congruent_geo_cor_out_df$Branch_len ~ congruent_geo_cor_out_df$mean))$r.squared))
# Split by Subgroup #
# Intergroup #
cor_df <- rbind(cor_df, data.frame( Condition = "Intergroup no outlier",
Pearson = cor(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == TRUE)], congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == TRUE)], method = "pearson"),
Spearman = cor(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == TRUE)], congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == TRUE)], method = "spearman"),
Rsquared = summary(lm(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == TRUE)] ~ congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == TRUE)]))$r.squared))
# Intragroup #
cor_df <- rbind(cor_df, data.frame( Condition = "Intragroup no outlier",
Pearson = cor(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == FALSE)], congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == FALSE)], method = "pearson"),
Spearman = cor(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == FALSE)], congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == FALSE)], method = "spearman"),
Rsquared = summary(lm(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == FALSE)] ~ congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == FALSE)]))$r.squared))
cor_df[,2:4] <- cbind(round(cor_df$Pearson, digits = 4), round(cor_df$Spearman, digits = 4), round(cor_df$Rsquared, digits = 4))
print(cor_df)
## Condition Pearson Spearman Rsquared
## 1 All with outlier 0.3312 0.4940 0.1097
## 2 All no outlier 0.6946 0.5645 0.4825
## 3 Intergroup no outlier 0.8294 0.7912 0.6879
## 4 Intragroup no outlier 0.3892 0.4180 0.1515
In this section we go back and calculate per penalty transfers into Geobacillus (using the raw numeric values). We produce the correlations as in 4. Average transfers per branch, except independently for each penalty. Sections 2, 3, and 4 are identical except for the change in transfer penalty.
edge_raw_dfs$time <- cbind(edge_raw_dfs$time, Branch_len = anoxy_geo_time$edge.length,
Index = rownames(edge_raw_dfs$time))
head(edge_raw_dfs$time)
## E1 E2 Subgroup T4 T5 T6 Branch_len Index
## 1 28 1 0 14 8 7 0.040005508 1
## 2 28 22 0 17 10 9 0.038648816 2
## 3 43 28 0 23 13 9 0.355284380 3
## 4 43 21 0 37 26 15 0.148146545 4
## 5 31 17 1 69 65 59 0.001683514 5
## 6 31 18 1 56 55 50 0.001943465 6
# First find all nodes related to Anoxybacillus
anoxybacillus_tips <- grep("Anoxybacillus", anoxy_geo_time$tip.label, value = TRUE)
anoxybacillus_nodes <- as.vector(descendants(anoxy_geo_time4, MRCA(anoxy_geo_time4,
anoxybacillus_tips), "ALL"))
# Subset our list of edge_dfs to include a geo_only data frame
removed_edges <- edge_raw_dfs$time[edge_raw_dfs$time$E2 %in% as.character(anoxybacillus_nodes),
1:2]
edge_raw_dfs$time_geo <- edge_raw_dfs$time[!edge_raw_dfs$time$E2 %in% as.character(anoxybacillus_nodes),
]
removed_edges_l <- split(removed_edges, seq(nrow(removed_edges)))
# For each of the unmapped edges, identified previously, remove that edge as
# well (one is already gone as it was Anoxybacillus!) - subset to new
# dataframe.
edge_raw_dfs$time_geo_congruent <- edge_raw_dfs$time_geo
for (nodes in unmapped_edges) {
removed_edges <- rbind(removed_edges, edge_raw_dfs$time_geo_congruent[which(edge_raw_dfs$time_geo_congruent$E1 %in%
nodes[1] & edge_raw_dfs$time_geo_congruent$E2 %in% nodes[2]), 1:2])
edge_raw_dfs$time_geo_congruent <- edge_raw_dfs$time_geo_congruent[!(edge_raw_dfs$time_geo_congruent$E1 %in%
nodes[1] & edge_raw_dfs$time_geo_congruent$E2 %in% nodes[2]), ]
}
# Extract the time_geo_congruent as a separate dataframe, and also make a
# copy without the outlier point #
congruent_geo_cor_raw_df <- edge_raw_dfs$time_geo_congruent
plot(anoxy_geo_time)
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]],
"red")))
Tree coloured on a scale according to the number of transfers at each branch. Numbers above each represent number of transfer events. In black, branches which are excluded from the correlations.
# Plot the tree with edges coloured by the fraction #
plotBranchbyTrait(anoxy_geo_time, edge_raw_dfs$time$T4, method = "edges", legend = 0.25,
title = "Number of transfers")
# Anotate the branches with the numeric values #
for (i in 1:nrow(edge_raw_dfs$time)) {
entry <- edge_raw_dfs$time[i, ]
annotate_transfer_num(entry, i, "T4")
}
# Colour the trimmed edges black #
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]],
"black")))
Plot the branch lengths vs raw number of transfers at branch
# Make the subgrouping call logical rather than binary #
congruent_geo_cor_raw_df$Subgroup <- as.logical(congruent_geo_cor_df$Subgroup)
# Plot the scatter plot
ggplot(congruent_geo_cor_raw_df, aes(x = Branch_len, y = T4, label = Index,
color = Subgroup)) + geom_point(size = 3) + geom_text(aes(label = Index),
hjust = 0, vjust = -1) + xlab("Branch length") + ylab("Raw number of HGTs at penalty = 4") +
scale_color_manual(values = c("blue", "red"))
Correlation calculations as in 4. Branch length correlation (2)
T4_cor_df <- CalcCorrLengthTransfers(congruent_geo_cor_raw_df, "T4", 39)
print(T4_cor_df)
## Condition Pearson Spearman Rsquared
## 1 All with outlier 0.3332 0.5246 0.1110
## 2 All no outlier 0.7169 0.6040 0.5139
## 3 Intergroup no outlier 0.8174 0.7925 0.6682
## 4 Intragroup no outlier 0.3840 0.4521 0.1475
Tree coloured on a scale according to the number of transfers at each branch. Numbers above each represent number of transfer events. In black, branches which are excluded from the correlations.
# Plot the tree with edges coloured by the fraction #
plotBranchbyTrait(anoxy_geo_time, edge_raw_dfs$time$T5, method = "edges", legend = 0.25,
title = "Number of transfers")
# Anotate the branches with the numeric values #
for (i in 1:nrow(edge_raw_dfs$time)) {
entry <- edge_raw_dfs$time[i, ]
annotate_transfer_num(entry, i, "T5")
}
# Colour the trimmed edges black #
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]],
"black")))
Plot the branch lengths vs raw number of transfers at branch
# Make the subgrouping call logical rather than binary #
congruent_geo_cor_raw_df$Subgroup <- as.logical(congruent_geo_cor_df$Subgroup)
# Plot the scatter plot
ggplot(congruent_geo_cor_raw_df, aes(x = Branch_len, y = T5, label = Index,
color = Subgroup)) + geom_point(size = 3) + geom_text(aes(label = Index),
hjust = 0, vjust = -1) + xlab("Branch length") + ylab("Raw number of HGTs at penalty = 5") +
scale_color_manual(values = c("blue", "red"))
Correlation calculations as in 4. Branch length correlation (2)
T5_cor_df <- CalcCorrLengthTransfers(congruent_geo_cor_raw_df, "T5", 39)
print(T5_cor_df)
## Condition Pearson Spearman Rsquared
## 1 All with outlier 0.3143 0.4833 0.0988
## 2 All no outlier 0.6948 0.5632 0.4827
## 3 Intergroup no outlier 0.8370 0.7995 0.7006
## 4 Intragroup no outlier 0.4002 0.4180 0.1601
Tree coloured on a scale according to the number of transfers at each branch. Numbers above each represent number of transfer events. In black, branches which are excluded from the correlations.
# Plot the tree with edges coloured by the fraction #
plotBranchbyTrait(anoxy_geo_time, edge_raw_dfs$time$T6, method = "edges", legend = 0.25,
title = "Number of transfers")
# Anotate the branches with the numeric values #
for (i in 1:nrow(edge_raw_dfs$time)) {
entry <- edge_raw_dfs$time[i, ]
annotate_transfer_num(entry, i, "T6")
}
# Colour the trimmed edges black #
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]],
"black")))
Plot the branch lengths vs raw number of transfers at branch
# Make the subgrouping call logical rather than binary #
congruent_geo_cor_raw_df$Subgroup <- as.logical(congruent_geo_cor_df$Subgroup)
# Plot the scatter plot
ggplot(congruent_geo_cor_raw_df, aes(x = Branch_len, y = T6, label = Index,
color = Subgroup)) + geom_point(size = 3) + geom_text(aes(label = Index),
hjust = 0, vjust = -1) + xlab("Branch length") + ylab("Raw number of HGTs at penalty = 6") +
scale_color_manual(values = c("blue", "red"))
Correlation calculations as in 4. Branch length correlation (2)
T6_cor_df <- CalcCorrLengthTransfers(congruent_geo_cor_raw_df, "T6", 39)
print(T6_cor_df)
## Condition Pearson Spearman Rsquared
## 1 All with outlier 0.3415 0.4994 0.1166
## 2 All no outlier 0.6658 0.5420 0.4433
## 3 Intergroup no outlier 0.8237 0.7891 0.6785
## 4 Intragroup no outlier 0.3815 0.4006 0.1456
added_penalty_col <- lapply(penalty_names, function(x) {
df <- eval(as.name(paste0(x, "_cor_df")))
penalty <- str_sub(x, 2, 2)
new_col <- data.frame(Penalty = rep(penalty, nrow(df)))
df <- cbind(df, new_col)
})
total_cor_df <- as.data.frame(rbindlist(added_penalty_col))
print(total_cor_df)
## Condition Pearson Spearman Rsquared Penalty
## 1 All with outlier 0.3332 0.5246 0.1110 4
## 2 All no outlier 0.7169 0.6040 0.5139 4
## 3 Intergroup no outlier 0.8174 0.7925 0.6682 4
## 4 Intragroup no outlier 0.3840 0.4521 0.1475 4
## 5 All with outlier 0.3143 0.4833 0.0988 5
## 6 All no outlier 0.6948 0.5632 0.4827 5
## 7 Intergroup no outlier 0.8370 0.7995 0.7006 5
## 8 Intragroup no outlier 0.4002 0.4180 0.1601 5
## 9 All with outlier 0.3415 0.4994 0.1166 6
## 10 All no outlier 0.6658 0.5420 0.4433 6
## 11 Intergroup no outlier 0.8237 0.7891 0.6785 6
## 12 Intragroup no outlier 0.3815 0.4006 0.1456 6
total_cor_df.molten <- melt(total_cor_df, measure.vars = c("Pearson", "Spearman",
"Rsquared"), variable.name = "Correlation", value.name = "Score")
## Replot the tree to show the various branch identities ##
plotBranchbyTrait(anoxy_geo_time, edge_dfs$time$Subgroup, method = "edges",
title = "Subgroup branches")
# Label the 'removed edges' black
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]],
"black")))
# Label the outlier edge green #
invisible(PlotSegmentByNodes(outlier_edge[1], outlier_edge[2], "green"))
# Label branches according to index #
for (i in as.vector(congruent_geo_cor_df$Index)) {
edgelabels(i, as.numeric(i), adj = c(0.5, -0.25), bg = "white", frame = "none",
cex = 0.7)
}
# Add legend #
legend("bottomright", legend = c("Intergroup", "Intragroup", "Outlier", "Excluded"),
text.col = c("blue", "red", "green", "black"))
# Plot the correlations #
ggplot(total_cor_df.molten, aes(x = Condition, y = Score, variable = Penalty,
colour = Penalty)) + geom_point(size = 3) + facet_wrap("Correlation", nrow = 1) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12), axis.text.y = element_text(size = 12),
strip.text.x = element_text(size = 12), axis.title = element_text(size = 12)) +
scale_y_continuous(limits = c(0, 1))